// Copyright 2022 The Google Research Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <map>
#include <string>

#include "test/common/Utils.h"

#ifdef GTEST
#include "gtest/gtest.h"
#endif

#include "file/base/path.h"
#include "third_party/cnpy/cnpy.h"
#include "src/ArchitectureParams.h"
#include "test/common/AcceleratorHarness.h"
#include "test/common/GoldModel.h"
#include "test/common/Utils.h"
#include "tensorflow/core/platform/types.h"

#ifdef GTEST
// Test fixture providing a large flat buffer that stands in for the
// accelerator's main memory. Each test stages its operands into this buffer,
// runs the accelerator op, and compares the output region against a gold
// model.
class AttentionTest : public testing::Test {
 protected:
  INPUT_DATATYPE* mainMemory;

  // 128M elements of simulated main memory, shared by all ops in one test.
  AttentionTest() { mainMemory = new INPUT_DATATYPE[128 * 1024 * 1024]; }
  // Fix: the buffer is allocated with new[], so it must be released with
  // delete[] — plain delete on a new[] allocation is undefined behavior.
  ~AttentionTest() { delete[] mainMemory; }
};
#endif

#ifdef GTEST
TEST_F(AttentionTest, QueryProjection) {
#else
void QueryProjection(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the query projection matmul of one attention layer:
  // activations (512x1024) times weights (1024x1024); the accelerator output
  // region is compared against the gold model's result.
  std::cout << "QUERY PROJECTION" << std::endl;
  std::cout << "----------------" << std::endl << std::endl;

  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;

  // Operand element counts for (512x1024) * (1024x1024).
  constexpr int ACTIVATION_ELEMS = BATCH_SIZE * MAX_SEQ_LEN * HIDDEN_SIZE;
  constexpr int WEIGHT_ELEMS = HIDDEN_SIZE * NUM_HEADS * ATTENTION_SIZE;
  constexpr int OUTPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;

  // Tiling: M = 512, N = 1024, P = 1024,
  // n1 = 1024 / DIM, p1 = 4, p2 = 1024 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      1024 / DIMENSION,          // N1
      32,                        // M1
      1024 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 1024 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Input activations (1x512x1024): randomize, then stage into main memory.
  INPUT_DATATYPE* activations = new INPUT_DATATYPE[ACTIVATION_ELEMS];
  load_array_with_random(activations, ACTIVATION_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], activations,
              sizeof(INPUT_DATATYPE) * ACTIVATION_ELEMS);

  // Projection weights (1024x1x1024): randomize, then stage into main memory.
  INPUT_DATATYPE* weights = new INPUT_DATATYPE[WEIGHT_ELEMS];
  load_array_with_random(weights, WEIGHT_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], weights,
              sizeof(INPUT_DATATYPE) * WEIGHT_ELEMS);

  // Gold-model reference output (1x512x1x1024).
  OUTPUT_DATATYPE* expected = new OUTPUT_DATATYPE[OUTPUT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, activations, weights, expected);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], expected, OUTPUT_ELEMS);

  delete[] activations;
  delete[] weights;
  delete[] expected;
}

#ifdef GTEST
TEST_F(AttentionTest, KeyProjection) {
#else
void KeyProjection(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the key projection matmul: activations (512x1024) times
  // weights (1024x1024); accelerator output checked against the gold model.
  std::cout << "KEY PROJECTION" << std::endl;
  std::cout << "--------------" << std::endl << std::endl;

  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;

  // Operand element counts for (512x1024) * (1024x1024).
  constexpr int ACTIVATION_ELEMS = BATCH_SIZE * MAX_SEQ_LEN * HIDDEN_SIZE;
  constexpr int WEIGHT_ELEMS = HIDDEN_SIZE * NUM_HEADS * ATTENTION_SIZE;
  constexpr int OUTPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;

  // Tiling: M = 512, N = 1024, P = 1024,
  // n1 = 1024 / DIM, p1 = 4, p2 = 1024 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      1024 / DIMENSION,          // N1
      32,                        // M1
      1024 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 1024 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Hidden states (1x512x1024): randomize and stage into main memory.
  INPUT_DATATYPE* hiddenStates = new INPUT_DATATYPE[ACTIVATION_ELEMS];
  load_array_with_random(hiddenStates, ACTIVATION_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], hiddenStates,
              sizeof(INPUT_DATATYPE) * ACTIVATION_ELEMS);

  // Key weights (1024x1x1024): randomize and stage into main memory.
  INPUT_DATATYPE* keyWeights = new INPUT_DATATYPE[WEIGHT_ELEMS];
  load_array_with_random(keyWeights, WEIGHT_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], keyWeights,
              sizeof(INPUT_DATATYPE) * WEIGHT_ELEMS);

  // Gold-model reference output (1x512x1x1024).
  OUTPUT_DATATYPE* golden = new OUTPUT_DATATYPE[OUTPUT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, hiddenStates, keyWeights, golden);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], golden, OUTPUT_ELEMS);

  delete[] hiddenStates;
  delete[] keyWeights;
  delete[] golden;
}

#ifdef GTEST
TEST_F(AttentionTest, ValueProjection) {
#else
void ValueProjection(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the value projection matmul: activations (512x1024) times
  // weights (1024x1024); accelerator output checked against the gold model.
  std::cout << "VALUE PROJECTION" << std::endl;
  std::cout << "----------------" << std::endl << std::endl;

  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;

  // Operand element counts for (512x1024) * (1024x1024).
  constexpr int ACTIVATION_ELEMS = BATCH_SIZE * MAX_SEQ_LEN * HIDDEN_SIZE;
  constexpr int WEIGHT_ELEMS = HIDDEN_SIZE * NUM_HEADS * ATTENTION_SIZE;
  constexpr int OUTPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;

  // Tiling: M = 512, N = 1024, P = 1024,
  // n1 = 1024 / DIM, p1 = 4, p2 = 1024 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      1024 / DIMENSION,          // N1
      32,                        // M1
      1024 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 1024 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Input activations (1x512x1024): randomize and stage into main memory.
  INPUT_DATATYPE* input = new INPUT_DATATYPE[ACTIVATION_ELEMS];
  load_array_with_random(input, ACTIVATION_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], input,
              sizeof(INPUT_DATATYPE) * ACTIVATION_ELEMS);

  // Value weights (1024x1x1024): randomize and stage into main memory.
  INPUT_DATATYPE* valueWeights = new INPUT_DATATYPE[WEIGHT_ELEMS];
  load_array_with_random(valueWeights, WEIGHT_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], valueWeights,
              sizeof(INPUT_DATATYPE) * WEIGHT_ELEMS);

  // Gold-model reference output (1x512x1x1024).
  OUTPUT_DATATYPE* reference = new OUTPUT_DATATYPE[OUTPUT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, input, valueWeights, reference);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], reference, OUTPUT_ELEMS);

  delete[] input;
  delete[] valueWeights;
  delete[] reference;
}

#ifdef GTEST
TEST_F(AttentionTest, AttentionScore) {
#else
void AttentionScore(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the attention-score matmul Q * K^T: (512x1024) * (512x1024)T,
  // i.e. (512x1024) * (1024x512); accelerator output region compared against
  // the gold model.
  std::cout << "Attention Score" << std::endl;
  std::cout << "---------------" << std::endl;
  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;

  // (512x1024) * (512x1024)T
  // (512x1024) * (1024x512)T
  // M = 512
  // N = 1024
  // P = 512
  // n1 = 1024 / DIM
  // p1 = 4
  // p2 = 512 / 4 / DIM
  const Params params = {
      16,                        // M0
      4,                         // P1
      1024 / DIMENSION,          // N1
      32,                        // M1
      512 / 4 / DIMENSION,       // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 1024 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      true,                      // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Query tensor Q: 1x512x1x1024.
  constexpr int QUERY_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;
  INPUT_DATATYPE* matrixA = new INPUT_DATATYPE[QUERY_ELEMS];
  load_array_with_random(matrixA, QUERY_ELEMS);
  // Fix: copy exactly the number of elements allocated for matrixA. The old
  // size expression used HIDDEN_SIZE in place of NUM_HEADS * ATTENTION_SIZE,
  // which only matched because NUM_HEADS == 1.
  std::memcpy(&mainMemory[params.INPUT_OFFSET], matrixA,
              sizeof(INPUT_DATATYPE) * QUERY_ELEMS);

  // Key tensor K: 1x512x1x1024 (MAX_SEQ_LEN rows, matching P = 512).
  // Fix: the old allocation used HIDDEN_SIZE (1024) where the sequence
  // dimension MAX_SEQ_LEN (512) belongs, over-allocating and staging 2x the
  // data the op consumes.
  constexpr int KEY_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;
  INPUT_DATATYPE* matrixB = new INPUT_DATATYPE[KEY_ELEMS];
  load_array_with_random(matrixB, KEY_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], matrixB,
              sizeof(INPUT_DATATYPE) * KEY_ELEMS);

  // Gold-model score output: 1x512x1x512.
  constexpr int SCORE_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * MAX_SEQ_LEN;
  OUTPUT_DATATYPE* matrixC = new OUTPUT_DATATYPE[SCORE_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, matrixA, matrixB, matrixC);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], matrixC, SCORE_ELEMS);

  delete[] matrixA;
  delete[] matrixB;
  delete[] matrixC;
}

#ifdef GTEST
TEST_F(AttentionTest, Context) {
#else
void Context(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the context matmul scores * V: (512x512) * (512x1024);
  // accelerator output region compared against the gold model.
  std::cout << "Context" << std::endl;
  std::cout << "-------" << std::endl;
  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;

  // Operand element counts for (512x512) * (512x1024).
  constexpr int SCORE_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * MAX_SEQ_LEN;
  constexpr int VALUE_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;
  constexpr int CONTEXT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;

  // Tiling: M = 512, N = 512, P = 1024,
  // n1 = 512 / DIM, p1 = 4, p2 = 1024 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      512 / DIMENSION,           // N1
      32,                        // M1
      1024 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 1024 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Attention scores (1x512x1x512): randomize and stage into main memory.
  INPUT_DATATYPE* scores = new INPUT_DATATYPE[SCORE_ELEMS];
  load_array_with_random(scores, SCORE_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], scores,
              sizeof(INPUT_DATATYPE) * SCORE_ELEMS);

  // Value tensor (1x512x1x1024): randomize and stage into main memory.
  INPUT_DATATYPE* values = new INPUT_DATATYPE[VALUE_ELEMS];
  load_array_with_random(values, VALUE_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], values,
              sizeof(INPUT_DATATYPE) * VALUE_ELEMS);

  // Gold-model context output (1x512x1x1024).
  OUTPUT_DATATYPE* goldContext = new OUTPUT_DATATYPE[CONTEXT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, scores, values, goldContext);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], goldContext,
                 CONTEXT_ELEMS);

  delete[] scores;
  delete[] values;
  delete[] goldContext;
}

#ifdef GTEST
TEST_F(AttentionTest, FeedForwardIntermediate) {
#else
void FeedForwardIntermediate(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the first feed-forward matmul: (512x1024) * (1024x4096);
  // accelerator output region compared against the gold model.
  std::cout << "Feed Forward Intermediate" << std::endl;
  std::cout << "-------------------------" << std::endl;
  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;
  constexpr int ATTENTION_SIZE = HIDDEN_SIZE / NUM_HEADS;
  constexpr int FEED_FORWARD_INTERMEDIATE_SIZE = 4096;

  // Operand element counts for (512x1024) * (1024x4096).
  constexpr int INPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * ATTENTION_SIZE;
  constexpr int WEIGHT_ELEMS =
      ATTENTION_SIZE * FEED_FORWARD_INTERMEDIATE_SIZE;
  constexpr int OUTPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * FEED_FORWARD_INTERMEDIATE_SIZE;

  // Tiling: M = 512, N = 1024, P = 4096,
  // n1 = 1024 / DIM, p1 = 4, p2 = 4096 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      1024 / DIMENSION,          // N1
      32,                        // M1
      4096 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 1024,                // WEIGHT_OFFSET
      512 * 1024 + 4096 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Attention output (1x512x1x1024): randomize and stage into main memory.
  INPUT_DATATYPE* attnOutput = new INPUT_DATATYPE[INPUT_ELEMS];
  load_array_with_random(attnOutput, INPUT_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], attnOutput,
              sizeof(INPUT_DATATYPE) * INPUT_ELEMS);

  // Intermediate feed-forward weights (1024x4096): randomize and stage.
  INPUT_DATATYPE* ffWeights = new INPUT_DATATYPE[WEIGHT_ELEMS];
  load_array_with_random(ffWeights, WEIGHT_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], ffWeights,
              sizeof(INPUT_DATATYPE) * WEIGHT_ELEMS);

  // Gold-model intermediate output (1x512x4096).
  OUTPUT_DATATYPE* goldOut = new OUTPUT_DATATYPE[OUTPUT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, attnOutput, ffWeights, goldOut);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], goldOut, OUTPUT_ELEMS);

  delete[] attnOutput;
  delete[] ffWeights;
  delete[] goldOut;
}

#ifdef GTEST
TEST_F(AttentionTest, FeedForwardFinal) {
#else
void FeedForwardFinal(INPUT_DATATYPE* mainMemory) {
#endif
  // Exercises the second feed-forward matmul: (512x4096) * (4096x1024);
  // accelerator output region compared against the gold model.
  std::cout << "Feed Forward Final" << std::endl;
  std::cout << "------------------" << std::endl;
  constexpr int NUM_HEADS = 1;
  constexpr int BATCH_SIZE = 1;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int FEED_FORWARD_INTERMEDIATE_SIZE = 4096;
  constexpr int FEED_FORWARD_FINAL_SIZE = 1024;

  // Operand element counts for (512x4096) * (4096x1024).
  constexpr int INPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * FEED_FORWARD_INTERMEDIATE_SIZE;
  constexpr int WEIGHT_ELEMS =
      FEED_FORWARD_INTERMEDIATE_SIZE * FEED_FORWARD_FINAL_SIZE;
  constexpr int OUTPUT_ELEMS =
      BATCH_SIZE * MAX_SEQ_LEN * NUM_HEADS * FEED_FORWARD_FINAL_SIZE;

  // Tiling: M = 512, N = 4096, P = 1024,
  // n1 = 4096 / DIM, p1 = 4, p2 = 1024 / 4 / DIM.
  const Params params = {
      16,                        // M0
      4,                         // P1
      4096 / DIMENSION,          // N1
      32,                        // M1
      1024 / 4 / DIMENSION,      // P2
      0,                         // INPUT_OFFSET
      512 * 4096,                // WEIGHT_OFFSET
      512 * 4096 + 4096 * 1024,  // OUTPUT_OFFSET
      false,                     // SOFTMAX
      1,                         // SCALE
      false,                     // TRANSPOSE
      0,                         // VECTOR_OFFSET
      false,                     // VEC_OP
      false,                     // VEC_SUB
      false,                     // VEC_SQUARE
      false,                     // VEC_REDUCE
      true,                      // CONST_SCALE
      0,                         // VEC_SCALE_OFFSET
      0                          // VEC_SUB_OFFSET
  };

  // Intermediate activations (1x512x1x4096): randomize and stage.
  INPUT_DATATYPE* intermediate = new INPUT_DATATYPE[INPUT_ELEMS];
  load_array_with_random(intermediate, INPUT_ELEMS);
  std::memcpy(&mainMemory[params.INPUT_OFFSET], intermediate,
              sizeof(INPUT_DATATYPE) * INPUT_ELEMS);

  // Final feed-forward weights (4096x1024): randomize and stage.
  INPUT_DATATYPE* finalWeights = new INPUT_DATATYPE[WEIGHT_ELEMS];
  load_array_with_random(finalWeights, WEIGHT_ELEMS);
  std::memcpy(&mainMemory[params.WEIGHT_OFFSET], finalWeights,
              sizeof(INPUT_DATATYPE) * WEIGHT_ELEMS);

  // Gold-model final output (1x512x1x1024).
  // (The original comment said 1x512x1x4096, but the output width is
  // FEED_FORWARD_FINAL_SIZE = 1024.)
  OUTPUT_DATATYPE* goldFinal = new OUTPUT_DATATYPE[OUTPUT_ELEMS];

  run_op(params, mainMemory);
  run_gold_op(params, intermediate, finalWeights, goldFinal);
  std::cout << "Accelerator vs Gold Model" << std::endl;
  compare_arrays(&mainMemory[params.OUTPUT_OFFSET], goldFinal, OUTPUT_ELEMS);

  delete[] intermediate;
  delete[] finalWeights;
  delete[] goldFinal;
}

#ifdef GTEST
TEST_F(AttentionTest, LayerNormalization) {
#else
void LayerNormalization(INPUT_DATATYPE* mainMemory) {
#endif
  // Layer normalization of a 512x1024 activation matrix, decomposed into
  // three accelerator passes over the same input: (1) per-row mean,
  // (2) per-row variance, (3) normalization using the mean and variance.
  // Each pass is compared against the gold model before the next runs.
  std::cout << "LAYER NORMALIZATION" << std::endl;
  std::cout << "-------------------" << std::endl;
  constexpr int MAX_SEQ_LEN = 512;
  constexpr int HIDDEN_SIZE = 1024;

  std::cout << "Calculating mean..." << std::endl;
  // compute mean
  // 512 x 1024
  // VEC_OP + VEC_REDUCE with SCALE = HIDDEN_SIZE: reduce each 1024-wide row
  // to a single value and divide by the row length, i.e. a per-row mean.
  // The 512 means land at OUTPUT_OFFSET = 512 * 1024 (right after the input).
  const Params meanParams = {
      16,                // M0
      1024 / DIMENSION,  // P1
      1,                 // N1
      512 / 16,          // M1
      1,                 // P2
      0,                 // INPUT_OFFSET
      0,                 // WEIGHT_OFFSET
      512 * 1024,        // OUTPUT_OFFSET
      false,             // SOFTMAX
      HIDDEN_SIZE,       // SCALE
      false,             // TRANSPOSE
      0,                 // VECTOR_OFFSET
      true,              // VEC_OP
      false,             // VEC_SUB
      false,             // VEC_SQUARE
      true,              // VEC_REDUCE
      true,              // CONST_SCALE
      0,                 // VEC_SCALE_OFFSET
      0                  // VEC_SUB_OFFSET
  };

  // Input activations (512x1024): randomized and staged into main memory.
  INPUT_DATATYPE* matrixA = new INPUT_DATATYPE[MAX_SEQ_LEN * HIDDEN_SIZE];
  load_array_with_random(matrixA, MAX_SEQ_LEN * HIDDEN_SIZE);
  std::memcpy(&mainMemory[meanParams.INPUT_OFFSET], matrixA,
              sizeof(INPUT_DATATYPE) * MAX_SEQ_LEN * HIDDEN_SIZE);

  // Gold-model per-row means (one value per sequence position).
  INPUT_DATATYPE* matrixB = new INPUT_DATATYPE[MAX_SEQ_LEN];

  // Gold-model per-row variances (one value per sequence position).
  INPUT_DATATYPE* matrixC = new INPUT_DATATYPE[MAX_SEQ_LEN];

  run_op(meanParams, mainMemory);
  // matrixA is passed in both operand slots; presumably the gold vector-op
  // path ignores the weight operand for a reduce — TODO confirm.
  run_gold_op(meanParams, matrixA, matrixA, matrixB);

  // check mean calculation
  compare_arrays(&mainMemory[meanParams.OUTPUT_OFFSET], matrixB, MAX_SEQ_LEN);

  std::cout << "Calculating variance..." << std::endl;
  // VEC_SUB + VEC_SQUARE + VEC_REDUCE: subtract the per-row mean (read from
  // VEC_SUB_OFFSET, where the mean pass wrote its output), square, and
  // reduce — a per-row variance. Results land at OUTPUT_OFFSET = 512 * 2048.
  const Params varianceParams = {
      16,                // M0
      1024 / DIMENSION,  // P1
      1,                 // N1
      512 / 16,          // M1
      1,                 // P2
      0,                 // INPUT_OFFSET
      0,                 // WEIGHT_OFFSET
      512 * 2048,        // OUTPUT_OFFSET
      false,             // SOFTMAX
      HIDDEN_SIZE,       // SCALE
      false,             // TRANSPOSE
      0,                 // VECTOR_OFFSET
      true,              // VEC_OP
      true,              // VEC_SUB
      true,              // VEC_SQUARE
      true,              // VEC_REDUCE
      true,              // CONST_SCALE
      0,                 // VEC_SCALE_OFFSET
      512 * 1024         // VEC_SUB_OFFSET
  };

  run_op(varianceParams, mainMemory);
  run_gold_op(varianceParams, matrixA, matrixB, matrixC);

  // check variance calculation
  compare_arrays(&mainMemory[varianceParams.OUTPUT_OFFSET], matrixC,
                 MAX_SEQ_LEN);

  std::cout << "Normalizing..." << std::endl;
  // Final pass: subtract the mean (VEC_SUB_OFFSET = mean output) and scale by
  // a per-row factor read from VEC_SCALE_OFFSET (= variance output;
  // CONST_SCALE false). Normalized matrix lands at OUTPUT_OFFSET = 512*4096.
  const Params normParams = {
      16,                // M0
      1024 / DIMENSION,  // P1
      1,                 // N1
      512 / 16,          // M1
      1,                 // P2
      0,                 // INPUT_OFFSET
      0,                 // WEIGHT_OFFSET
      512 * 4096,        // OUTPUT_OFFSET
      false,             // SOFTMAX
      HIDDEN_SIZE,       // SCALE
      false,             // TRANSPOSE
      0,                 // VECTOR_OFFSET
      true,              // VEC_OP
      true,              // VEC_SUB
      false,             // VEC_SQUARE
      false,             // VEC_REDUCE
      false,             // CONST_SCALE
      512 * 2048,        // VEC_SCALE_OFFSET
      512 * 1024         // VEC_SUB_OFFSET
  };

  run_op(normParams, mainMemory);
  run_gold_op(normParams, matrixA, matrixB, matrixC);

  // NOTE(review): the accelerator output is compared against matrixA (the
  // input buffer), not matrixC — this only makes sense if the gold op
  // normalizes matrixA in place for non-reduce vector ops; verify against
  // run_gold_op's implementation.
  compare_arrays(&mainMemory[normParams.OUTPUT_OFFSET], matrixA,
                 MAX_SEQ_LEN * HIDDEN_SIZE);

  delete[] matrixA;
  delete[] matrixB;
  delete[] matrixC;
}

int sc_main(int argc, char* argv[]) {
  INPUT_DATATYPE* mainMemory = new INPUT_DATATYPE[128 * 1024 * 1024];

  // run test depending on environment variable
  char* value = std::getenv("ATTENTION_TEST");
  std::string testName("");
  if (value) {
    testName = std::string(value);
  }

  std::map<std::string, void (*)(INPUT_DATATYPE*)> testMap = {
      {"QueryProjection", QueryProjection},
      {"KeyProjection", KeyProjection},
      {"ValueProjection", ValueProjection},
      {"AttentionScore", AttentionScore},
      {"Context", Context},
      {"FeedForwardIntermediate", FeedForwardIntermediate},
      {"FeedForwardFinal", FeedForwardFinal},
      {"LayerNormalization", LayerNormalization},
  };

  auto testFunctionIterator = testMap.find(testName);

  if (testFunctionIterator != testMap.end()) {
    (testFunctionIterator->second)(mainMemory);
  } else {
    std::cout
        << "Please set the ATTENTION_TEST environment variable to one of: "
        << std::endl;
    for (const auto& pair : testMap) {
      std::cout << "\t - " << pair.first << std::endl;
    }
  }

  return 0;

#endif
}
